library(readr)
library(janitor)
library(dplyr)
library(caret)
library(glmnet)
library(ISLR)
library(pls)
library(splines)
library(boot)
library(ggplot2)
library(gam)
# Read the solubility training and test sets; clean_names() standardizes
# the column names to snake_case.
train <- clean_names(read_csv("solubility_train.csv"))
test <- clean_names(read_csv("solubility_test.csv"))
(a) Fit a linear model using least squares on the training data, and report the test error obtained.
# (a) Least-squares linear model fit on the full training set.
ls.mod <- lm(solubility ~ ., data = train)
# Predict solubility for the held-out test observations.
pred.ls <- predict(ls.mod, newdata = test)
# Test-set mean squared error.
mse1 <- mean((test$solubility - pred.ls)^2) #0.5558898
The test error using multiple linear regression is 0.5558898.
(b) Fit a ridge regression model on the training data, with λ chosen by cross-validation. Report the test error obtained.
# (b) Ridge regression with lambda chosen by cross-validation.
# Matrix of predictors (model.matrix builds the design; drop the intercept
# column since glmnet adds its own).
x = model.matrix(solubility ~ ., train)[,-1]
# Vector of response.
y = train$solubility
# 10-fold cross-validation for ridge regression; alpha = 0 selects the
# ridge penalty, and type.measure = "mse" is the CV loss.
# (The unused hand-made lambda grid and preliminary glmnet fit from the
# original draft were removed: cv.glmnet builds its own lambda path.)
cv.out = cv.glmnet(x, y, alpha = 0, type.measure = "mse")
# CV curve; the two vertical lines mark the minimal-MSE and 1-SE lambdas.
plot(cv.out)
# Lambda giving the smallest cross-validated MSE.
best.lambda1 = cv.out$lambda.min
best.lambda1
## [1] 0.14784
# Refit the ridge regression at the selected lambda.
ridge.mod1 = glmnet(x, y, alpha = 0, lambda = best.lambda1)
# Predict the new observations from the test data.
test_matrix = model.matrix(solubility ~ ., test)[,-1]
pred.ridge = predict(ridge.mod1, newx = test_matrix)
# Test set error.
mse2 = mean((pred.ridge - test$solubility)^2) # 0.5147447
mse2
## [1] 0.5147447
Using ridge regression to fit the training data, the \(\lambda\) chosen by 10-fold cross-validation is 0.14784. The test error is 0.5147447.
(c) Fit a lasso model on the training data, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
# (c) Lasso (alpha = 1) with lambda chosen by 10-fold cross-validation.
# (The unused exp-scale lambda grid and preliminary glmnet fit from the
# original draft were removed: cv.glmnet builds its own lambda path.)
cv.out = cv.glmnet(x, y, alpha = 1)
plot(cv.out)
# Lambda giving the smallest cross-validated MSE.
best.lambda2 = cv.out$lambda.min
# Refit the lasso at the selected lambda.
lasso.mod1 = glmnet(x, y, alpha = 1, lambda = best.lambda2)
# Number of non-zero coefficient estimates at lambda.min
# (NOTE: coef() returns the intercept too, so this count includes it).
length(which(coef(cv.out, s = "lambda.min") != 0))
## [1] 141
# Test set error.
pred.lasso = predict(lasso.mod1, newx = test_matrix)
mse3 = mean((pred.lasso - test$solubility)^2) # 0.4973554
Using lasso regression to fit the training data, the \(\lambda\) chosen by 10-fold cross-validation is 0.0050716. The test error is 0.4973554, and the number of non-zero coefficients is 141.
(d) Fit a PLS model on the training data, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
# (d) Partial least squares with the number of components chosen by CV.
pls.fit = plsr(solubility ~ .,
               data = train,
               scale = TRUE,
               validation = "CV")
validationplot(pls.fit, val.type = "MSEP")
pls.RMSEP = RMSEP(pls.fit, estimate = "CV")
plot(pls.RMSEP, main = "RMSEP PLS Solubility", xlab = "components")
# BUG FIX: RMSEP()$val includes the 0-component (intercept-only) model at
# index 1, so the raw which.min() index is (number of components + 1).
# Subtract 1 to convert the index into the CV-selected component count;
# otherwise the plotted marker and predict(ncomp = ...) are off by one.
min_comp = which.min(pls.RMSEP$val) - 1
points(min_comp, min(pls.RMSEP$val), pch = 1, col = "red", cex = 1.5)
# Test set performance at the CV-selected number of components.
pred.pls = predict(pls.fit, test, ncomp = min_comp)
# Test set MSE.
mse4 = mean((pred.pls - test$solubility)^2) # 0.5356044
Using partial least squares to fit the training data, the number of components chosen by cross-validation is 19. The test error is 0.5356044.
(e) Write a brief report containing (i) exploratory analysis of the training data (ii) a discussion on the results obtained in (a)∼(d).
# (e)(i) Exploratory analysis: scatterplot matrices of the predictor
# subsets in columns 209:229 against solubility.  The count-valued
# predictors are split into three groups so each pairs() panel stays
# readable; sub_continuous keeps the remaining continuous predictors.
sub_count1 = select(train[, 209:229], numatoms:numrotbonds, solubility)
sub_count2 = select(train[, 209:229], numdblbonds:numoxygen, solubility)
sub_count3 = select(train[, 209:229], numsulfer:numrings, solubility)
sub_continuous = select(train[, 209:229], -(numatoms:numrings))
pairs(sub_count1)
pairs(sub_count2)
pairs(sub_count3)
pairs(sub_continuous)
# (e)(ii) Compare the four methods' test MSEs with a bar chart.
mse = c(mse1, mse2, mse3, mse4)
methods = c("least squares", "ridge regression", "lasso", "PLS")
ggplot(data.frame(methods, mse), aes(x = methods, y = mse)) +
  geom_col()
# dotchart(mse, labels = methods, cex = .7, xlab = "MSE")
# Read the concrete data; clean_names() standardizes column names.
concrete <- clean_names(read_csv("concrete.csv"))
(a) Create plots of predictors using the function featurePlot() in the caret package.
# (a) caret feature plots: every predictor (all columns except the last,
# the response) against compressive strength, drawn with a grid ("g"),
# raw points ("p"), and a smoother ("smooth").
featurePlot(
  x = concrete[, -ncol(concrete)],
  y = concrete$compressivestrength,
  type = c("g", "p", "smooth"),
  between = list(x = 1, y = 1)
)
(b) Perform polynomial regression to predict concrete compressive strength using water as the predictor.
# (b) Degree-5 raw polynomial of water.  With raw = TRUE the terms are
# correlated powers (not orthogonal), so the individual t-tests below are
# not informative on their own — none is significant, motivating the
# nested-model ANOVA and CV degree search that follow.
fit = lm(compressivestrength ~ poly(water, degree = 5, raw = TRUE), data = concrete)
summary(fit)
##
## Call:
## lm(formula = compressivestrength ~ poly(water, degree = 5, raw = TRUE),
## data = concrete)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.701 -10.748 -0.006 10.208 49.139
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -6.845e+02 3.479e+03 -0.197
## poly(water, degree = 5, raw = TRUE)1 -2.554e+00 9.907e+01 -0.026
## poly(water, degree = 5, raw = TRUE)2 2.890e-01 1.116e+00 0.259
## poly(water, degree = 5, raw = TRUE)3 -2.990e-03 6.226e-03 -0.480
## poly(water, degree = 5, raw = TRUE)4 1.172e-05 1.719e-05 0.682
## poly(water, degree = 5, raw = TRUE)5 -1.613e-08 1.881e-08 -0.857
## Pr(>|t|)
## (Intercept) 0.844
## poly(water, degree = 5, raw = TRUE)1 0.979
## poly(water, degree = 5, raw = TRUE)2 0.796
## poly(water, degree = 5, raw = TRUE)3 0.631
## poly(water, degree = 5, raw = TRUE)4 0.496
## poly(water, degree = 5, raw = TRUE)5 0.391
##
## Residual standard error: 15.02 on 1024 degrees of freedom
## Multiple R-squared: 0.1953, Adjusted R-squared: 0.1914
## F-statistic: 49.72 on 5 and 1024 DF, p-value: < 2.2e-16
# predict(fit, concrete, type = 'response')
# ANOVA compare models at different degrees
# fit.2 .. fit.5 use orthogonal poly() terms, so each model nests the
# previous one and anova() can compare them with sequential F tests.
fit.1 = lm(compressivestrength ~ water, data = concrete)
fit.2 = lm(compressivestrength ~ poly(water, 2), data = concrete)
fit.3 = lm(compressivestrength ~ poly(water, 3), data = concrete)
fit.4 = lm(compressivestrength ~ poly(water, 4), data = concrete)
fit.5 = lm(compressivestrength ~ poly(water, 5), data = concrete)
a <- anova(fit.1, fit.2, fit.3, fit.4, fit.5) # choose degree=4
a$`Pr(>F)`
## [1] NA 4.695569e-16 4.196616e-13 1.426268e-05 3.914561e-01
# Use cross-validation to select the optimal degree d for the polynomial.
# cv.glm() without a K argument performs leave-one-out CV; delta[1] is the
# raw (uncorrected) CV estimate of test MSE.
cv.error = rep(0,10)
for (i in 1:10) {
glm.fit = glm(compressivestrength~poly(water, i), data = concrete)
cv.error[i] = cv.glm(concrete, glm.fit)$delta[1]
}
plot(cv.error)
# Degree 4 is selected: it agrees with both the ANOVA tests above and the
# LOOCV error curve.
poly_mod = fit.4
Using ANOVA to compare the nested models: the p-value comparing the linear Model 1 to the quadratic Model 2 is essentially zero (< 10^-15), indicating that a linear fit is not sufficient. Similarly, the p-value comparing Model 2 to Model 3 is very low (4.197e-13), indicating that Model 3 is superior to Model 2. In the same way, Model 4 is superior to Model 3. However, Model 5 seems unnecessary because its p-value is 0.3915. Hence I picked d = 4 as the optimal degree.
LOOCV cross-validation is also used to pick the optimal degree corresponding to the suitable test error. There’s a clear sharp drop in the estimated test MSE between the linear and 4th degree fits, but no clear improvement after d=4, with fluctuation from using higher-order polynomials. So after running the LOOCV cross-validation, the optimal degree of d I chose is still 4, which is the same as the optimal degree of d that I chose from the hypothesis testing using ANOVA.
Make a plot of the resulting polynomial fit to the data
# Plot the chosen degree-4 polynomial fit (poly_mod) over the raw data.
water.range <- range(concrete$water)
grid.water <- seq(from = water.range[1], to = water.range[2])
poly.pred <- predict(poly_mod, newdata = list(water = grid.water), se = TRUE)
plot(concrete$water, concrete$compressivestrength, col = "gray")
lines(grid.water, poly.pred$fit, lwd = 2)
(c) Fit a regression spline using water as the predictor for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.
# (c) B-spline regression fits for df = 3..14; record the training RSS.
# (The original preallocated 14 slots and left the first two NA, then
# sliced them back off for the plot; indexing directly over 3:14 avoids
# the wasted slots and the rss[-c(1, 2)] fix-up.)
df.range = 3:14
rss = vapply(df.range, function(d) {
  sum(lm(compressivestrength ~ bs(water, df = d), data = concrete)$residuals^2)
}, numeric(1))
plot(df.range, rss, xlab = "Degrees of freedom", ylab = "RSS", type = "l")
# Smoothing splines: one with df fixed at 13 (chosen from the RSS curve),
# one with df chosen by leave-one-out CV.
fit = smooth.spline(concrete$water,concrete$compressivestrength,df = 13)
fit2 = smooth.spline(concrete$water,concrete$compressivestrength, cv = TRUE)
## Warning in smooth.spline(concrete$water, concrete$compressivestrength, cv =
## TRUE): cross-validation with non-unique 'x' values seems doubtful
rss1 <- sum(residuals(fit)^2)
rss2 <- sum(residuals(fit2)^2)
plot(concrete$water,concrete$compressivestrength,xlim = range(concrete$water) ,cex = .5,col = "darkgrey")
title(" Smoothing Spline ")
lines(fit,col = "red",lwd = 2)
lines(fit2,col = "green",lwd = 2)
# Report the CV-selected df from the fitted object instead of hard-coding
# the run-specific value 8.04, so the legend stays correct across re-runs.
legend('topright', legend = c('13 DF', sprintf('%.2f DF', fit2$df)),
col = c('red','green'), lty = 1, lwd = 2, cex = 0.8)
I tested degrees of freedom from 3 to 14. There is a clear trend for the RSS to decrease and then begin increasing again around df = 13. I then fit the model with 13 degrees of freedom. As shown in the plot, the fit for df = 13 is very similar to that for df = 8.04, which was chosen by the cross-validation process. This suggests that my pick was a reasonable one.
(d) Fit a GAM on the training data. Plot the results and explain your findings.
# (d) GAM with a smoothing-spline term s() for each of the eight
# predictors.
gam1 = gam(compressivestrength ~ s(water) + s(age) + s(fineaggregate) + s(cement) + s(blastfurnaceslag) + s(superplasticizer) + s(coarseaggregate) + s(flyash), data = concrete)
summary(gam1)
##
## Call: gam(formula = compressivestrength ~ s(water) + s(age) + s(fineaggregate) +
## s(cement) + s(blastfurnaceslag) + s(superplasticizer) + s(coarseaggregate) +
## s(flyash), data = concrete)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -25.30117 -3.99055 0.00221 3.86744 27.90102
##
## (Dispersion Parameter for gaussian family taken to be 40.054)
##
## Null Deviance: 287175.2 on 1029 degrees of freedom
## Residual Deviance: 39933.81 on 996.9998 degrees of freedom
## AIC: 6758.408
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(water) 1 14933 14933 372.8214 < 2.2e-16 ***
## s(age) 1 46609 46609 1163.6559 < 2.2e-16 ***
## s(fineaggregate) 1 23603 23603 589.2843 < 2.2e-16 ***
## s(cement) 1 54568 54568 1362.3687 < 2.2e-16 ***
## s(blastfurnaceslag) 1 28270 28270 705.7901 < 2.2e-16 ***
## s(superplasticizer) 1 900 900 22.4736 2.440e-06 ***
## s(coarseaggregate) 1 366 366 9.1408 0.002564 **
## s(flyash) 1 2032 2032 50.7428 2.015e-12 ***
## Residuals 997 39934 40
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(water) 3 36.16 < 2.2e-16 ***
## s(age) 3 454.35 < 2.2e-16 ***
## s(fineaggregate) 3 18.20 1.677e-11 ***
## s(cement) 3 11.99 1.026e-07 ***
## s(blastfurnaceslag) 3 10.34 1.043e-06 ***
## s(superplasticizer) 3 24.79 1.776e-15 ***
## s(coarseaggregate) 3 2.07 0.102976
## s(flyash) 3 5.14 0.001574 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot all eight fitted smooth terms with standard-error bands in a 2x4
# grid.
par(mar = c(4,3,3,3))
par(mfrow = c(2,4))
plot(gam1, se = TRUE,col = "blue")
# gam2 omits cement entirely; gam3 enters cement as a linear term.  The
# three nested models (no cement < linear cement < s(cement) = gam1) are
# then compared with F tests.
gam2 = gam(compressivestrength ~ s(blastfurnaceslag) + s(flyash) + s(superplasticizer) + s(coarseaggregate) + s(fineaggregate) + s(age) + s(water), data = concrete)
gam3 = gam(compressivestrength ~ cement + s(blastfurnaceslag) + s(flyash) + s(superplasticizer) + s(coarseaggregate) + s(fineaggregate) + s(age) + s(water), data = concrete)
anova(gam2, gam3, gam1, test = "F")
## Analysis of Deviance Table
##
## Model 1: compressivestrength ~ s(blastfurnaceslag) + s(flyash) + s(superplasticizer) +
## s(coarseaggregate) + s(fineaggregate) + s(age) + s(water)
## Model 2: compressivestrength ~ cement + s(blastfurnaceslag) + s(flyash) +
## s(superplasticizer) + s(coarseaggregate) + s(fineaggregate) +
## s(age) + s(water)
## Model 3: compressivestrength ~ s(water) + s(age) + s(fineaggregate) +
## s(cement) + s(blastfurnaceslag) + s(superplasticizer) + s(coarseaggregate) +
## s(flyash)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 1001 58392
## 2 1000 41134 1 17258.1 430.8708 < 2.2e-16 ***
## 3 997 39934 3 1200.3 9.9888 1.725e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The fitted-term plots showed that cement has a strong, nearly linear trend. Hence, I compared three models: one excluding cement, one with a linear cement term, and one with a smooth term s(cement).
After the ANOVA test and model summary, the term s(coarseaggregate) is not statistically significant for the nonparametric effect, since its p-value is large (0.102976). It is statistically significant that s(cement) is better than the linear cement term, with a p-value of 1.725e-06. Hence, I conclude that the model with s(cement) is the best of the three models.